This repository contains the codebase for our paper, "Rethinking Reward Models for Multi-Domain Test-Time Scaling."
conda create -n multi-rm python=3.10.14
conda activate multi-rm
pip install -r requirements.txt
pip install flash-attn --no-build-isolation
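If the installation succeeds, a quick sanity check (a minimal sketch; it only assumes a CUDA-capable GPU is visible) is to confirm that PyTorch sees the GPU and that the flash-attn build imports cleanly:
# optional sanity check: PyTorch version, GPU visibility, and the flash-attn build
python -c "import torch; print(torch.__version__, torch.cuda.is_available())"
python -c "import flash_attn; print(flash_attn.__version__)"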
# TASK_TYPE can be one of:
# gORM / gPRM
TASK_TYPE=[choose_one_above]
# generate data
python -m data_generation.generate_data \
--output_dir [OUTPUT_DIR] \
--task_type ${TASK_TYPE}
# preprocess data
python -m data_generation.preprocess_data \
--output_dir [OUTPUT_DIR] \
--task_type ${TASK_TYPE}
# shorten critique (optional)
python -m data_generation.shorten_critique \
--output_dir [OUTPUT_DIR] \
--task_type ${TASK_TYPE}
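Putting the three steps together, a full gPRM pass might look like this (a sketch; ./data/gPRM is an illustrative output directory, and the critique-shortening step stays optional):
# illustrative end-to-end data generation for gPRM
TASK_TYPE=gPRM
OUTPUT_DIR=./data/gPRM
python -m data_generation.generate_data --output_dir ${OUTPUT_DIR} --task_type ${TASK_TYPE}
python -m data_generation.preprocess_data --output_dir ${OUTPUT_DIR} --task_type ${TASK_TYPE}
python -m data_generation.shorten_critique --output_dir ${OUTPUT_DIR} --task_type ${TASK_TYPE}  # optional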
# Training dORM / dPRM
# Use the appropriate config file:
# ./configs/dORM-14B.yaml
# ./configs/dPRM-14B.yaml
# ./configs/dORM-8B.yaml
# ./configs/dPRM-8B.yaml
accelerate launch -m discriminative.train \
--config ./configs/dORM-14B.yaml \
--output_dir ./[TRAINING_RESULTS]/dORM-14B \
--per_device_batch_size 4 \
--category all
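The other discriminative configs use the same entry point; for example, an 8B dPRM run (a sketch; ./results is an illustrative output root) only swaps the config file and output directory:
# illustrative: train dPRM-8B
accelerate launch -m discriminative.train \
  --config ./configs/dPRM-8B.yaml \
  --output_dir ./results/dPRM-8B \
  --per_device_batch_size 4 \
  --category all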
# Training gORM / gPRM
# Use the appropriate config file:
# ./configs/gORM-14B.yaml
# ./configs/gPRM-14B.yaml
# ./configs/gORM-8B.yaml
# ./configs/gPRM-8B.yaml
accelerate launch -m generative.train \
--config ./configs/gORM-14B.yaml \
--output_dir ./[TRAINING_RESULTS]/gORM-14B \
--per_device_batch_size 4 \
--category all
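Accelerate's own flags can be passed at launch time as well; a gPRM-8B run on multiple GPUs might look like this (a sketch assuming 4 visible GPUs; --num_processes is a standard accelerate launch flag, and the output path is illustrative):
# illustrative: gPRM-8B on 4 GPUs
accelerate launch --num_processes 4 -m generative.train \
  --config ./configs/gPRM-8B.yaml \
  --output_dir ./results/gPRM-8B \
  --per_device_batch_size 4 \
  --category all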
# TEST can be one of:
# test (CoTs generated by Llama-3.1-8B-Instruct)
# test_smollm (CoTs generated by SmolLM3-3B)
# test_qwen (CoTs generated by Qwen2.5-7B-Instruct)
# test_gemma (CoTs generated by gemma-2-9b-it)
# test_llama (CoTs generated by Llama-3.1-70B-Instruct)
# Inference for dORM / dPRM
# Use the appropriate model checkpoint:
# dongboklee/dORM-14B
# dongboklee/dPRM-14B
# or use your own trained models
python -m discriminative.get_reward \
--data_path dongboklee/[TEST] \
--model_id dongboklee/dORM-14B \
--output_dir ./[REWARD_RESULTS]/dORM-14B-[TEST] \
--per_device_batch_size 8 \
--category all
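Process-level scoring follows the same pattern; for example, with the released dPRM-14B checkpoint on the default test split (a sketch; the output path is illustrative):
# illustrative: score the default test split with dPRM-14B
python -m discriminative.get_reward \
  --data_path dongboklee/test \
  --model_id dongboklee/dPRM-14B \
  --output_dir ./rewards/dPRM-14B-test \
  --per_device_batch_size 8 \
  --category all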
# Inference for gORM / gPRM
# Use the appropriate model checkpoint:
# dongboklee/gORM-14B-merged, TASK_TYPE=gORM
# dongboklee/gPRM-14B-merged, TASK_TYPE=gPRM
python -m generative.get_reward \
--data_path dongboklee/[TEST] \
--model_id dongboklee/gORM-14B-merged \
--output_dir ./[REWARD_RESULTS]/gORM-14B-[TEST] \
--task_type gORM \
--category all
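And analogously for the merged gPRM checkpoint (again a sketch; the output path is illustrative):
# illustrative: score the default test split with gPRM-14B-merged
python -m generative.get_reward \
  --data_path dongboklee/test \
  --model_id dongboklee/gPRM-14B-merged \
  --output_dir ./rewards/gPRM-14B-test \
  --task_type gPRM \
  --category all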
# Inference for gORM / gPRM (for your own trained models)
# Use the appropriate model checkpoint:
# [LOCAL_DIR]/gORM-14B, TASK_TYPE=gORM
# [LOCAL_DIR]/gPRM-14B, TASK_TYPE=gPRM
# Merge LoRA for vLLM inference
python -m generative.merge_lora \
--input_dir [LOCAL_DIR]/gORM-14B # saved to [LOCAL_DIR]/gORM-14B/tmp
python -m generative.get_reward \
--data_path dongboklee/[TEST] \
--model_id [LOCAL_DIR]/gORM-14B/tmp \
--output_dir ./[REWARD_RESULTS]/gORM-14B-[TEST] \
--task_type gORM \
--category all
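A locally trained gPRM goes through the same two commands (a sketch; [LOCAL_DIR] is the --output_dir used during training, and the reward output path is illustrative):
# illustrative: merge and score a locally trained gPRM-14B
python -m generative.merge_lora \
  --input_dir [LOCAL_DIR]/gPRM-14B
python -m generative.get_reward \
  --data_path dongboklee/test \
  --model_id [LOCAL_DIR]/gPRM-14B/tmp \
  --output_dir ./rewards/gPRM-14B-test \
  --task_type gPRM \
  --category all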
# TEST can be one of:
# test (CoTs generated by Llama-3.1-8B-Instruct)
# test_smollm (CoTs generated by SmolLM3-3B)
# test_qwen (CoTs generated by Qwen2.5-7B-Instruct)
# test_gemma (CoTs generated by gemma-2-9b-it)
# test_llama (CoTs generated by Llama-3.1-70B-Instruct)
TEST=[choose_one_above]
# Or use your own reward dirs instead of HF hubs:
# [MODEL_NAME]/[TEST]/[CATEGORY]_reward.json
python -m evaluation.evaluate \
--data_path dongboklee/${TEST} \
--output_dir [OUTPUT_DIR] \
--reward_dirs \
dongboklee/dORM-14B-${TEST} \
dongboklee/dPRM-14B-${TEST} \
dongboklee/gORM-14B-${TEST} \
dongboklee/gPRM-14B-${TEST} \
--model_names dORM-14B dPRM-14B gORM-14B gPRM-14B \
--strategies last min mean mean \
--num_runs 100
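To evaluate rewards you computed yourself, point --reward_dirs at the local directories written by get_reward instead of the hosted ones; for example, for a single locally scored dPRM-14B run (a sketch; paths are illustrative, and the aggregation strategy mirrors the dPRM column in the command above):
# illustrative: evaluate locally computed dPRM-14B rewards on test_qwen
TEST=test_qwen
python -m evaluation.evaluate \
  --data_path dongboklee/${TEST} \
  --output_dir ./eval_results \
  --reward_dirs ./rewards/dPRM-14B-${TEST} \
  --model_names dPRM-14B \
  --strategies min \
  --num_runs 100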
# CSV_FILE can be one of:
# [OUTPUT_DIR_FROM_ABOVE]/best_of_n.csv
# [OUTPUT_DIR_FROM_ABOVE]/weighted_vote.csv
CSV_FILE=[choose_one_above]
# [OUTPUT_FILE_PREFIX]=example
# -> example_legend.png / example_legend.pdf
# -> example.png / example.pdf
python -m evaluation.plot \
--input_file ${CSV_FILE} \
--output_file [OUTPUT_FILE_PREFIX]
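For instance, plotting the weighted-vote results from the evaluation step above (file names are illustrative):
# illustrative: produces wv.png / wv.pdf plus wv_legend.png / wv_legend.pdf
python -m evaluation.plot \
  --input_file ./eval_results/weighted_vote.csv \
  --output_file wv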
The assets of this repository are listed below, including the training and test datasets, model checkpoints, and the rewards produced by the four reward model variants.
Name | Description |
---|---|
train | multi-domain training dataset for dORM/dPRM (mostly adapted from VersaPRM). |
train_gORM | multi-domain training dataset for gORM generated by QwQ-32B. |
train_gPRM | multi-domain training dataset for gPRM generated by QwQ-32B. |
test | multi-domain test dataset with CoTs (N=128) generated by Llama-3.1-8B-Instruct (mostly adapted from VersaPRM). |
test_smollm | multi-domain test dataset with CoTs (N=16) generated by SmolLM3-3B. |
test_qwen | multi-domain test dataset with CoTs (N=16) generated by Qwen2.5-7B-Instruct. |
test_gemma | multi-domain test dataset with CoTs (N=16) generated by gemma-2-9b-it. |
test_llama | multi-domain test dataset with CoTs (N=16) generated by Llama-3.1-70B-Instruct. |
Name | Backbone | Trained On | LoRA-merged version |
---|---|---|---|
dORM-14B | 14B backbone | train | — |
dPRM-14B | 14B backbone | train | — |
gORM-14B | 14B backbone | train_gORM | gORM-14B-merged |
gPRM-14B | 14B backbone | train_gPRM | gPRM-14B-merged |
dORM-8B | 8B backbone | train | — |
dPRM-8B | 8B backbone | train | — |
gORM-8B | 8B backbone | train_gORM | gORM-8B-merged |
gPRM-8B | 8B backbone | train_gPRM | gPRM-8B-merged |
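Since the commands above read these assets as dongboklee/<name> repositories on the Hugging Face Hub, they can also be fetched ahead of time with the standard CLI (a sketch; pick whichever dataset or checkpoint you need):
# illustrative: pre-download one dataset and one checkpoint
huggingface-cli download dongboklee/test --repo-type dataset
huggingface-cli download dongboklee/dPRM-14B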
@article{multi-rm,
  title   = {Rethinking Reward Models for Multi-Domain Test-Time Scaling},
  author  = {Lee, Dong Bok and Lee, Seanie and Park, Sangwoo and Kang, Minki and Baek, Jinheon and Kim, Dongki and Wagner, Dominik and Jin, Jiongdao and Lee, Heejun and Bocklet, Tobias and Wang, Jinyu and Fu, Jingjing and Hwang, Sung Ju and Bian, Jiang and Song, Lei},
  journal = {arXiv preprint arXiv:2510.00492},
  year    = {2025}
}